# Import raw data (expects diabetes.csv in the working directory)
diabetes <- read.csv("diabetes.csv", header = TRUE)

# In this dataset a 0 in these measurement columns is a missing-value code,
# so recode 0 -> NA.  Pregnancies can legitimately be 0 and is left alone.
var_with_NAs <- c("Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "Age")
for (i in var_with_NAs) {
  diabetes[[i]][diabetes[[i]] == 0] <- NA
}

# Recode the 0/1 outcome to descriptive labels, then make it a factor
# (levels sort alphabetically: "diabetes", "healthy")
diabetes$Outcome[diabetes$Outcome == 0] <- "healthy"
diabetes$Outcome[diabetes$Outcome == 1] <- "diabetes"
diabetes$Outcome <- as.factor(diabetes$Outcome)

# Check the data structure
str(diabetes)
## 'data.frame': 768 obs. of 9 variables:
## $ Pregnancies : int 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : int 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : int 72 66 64 66 40 74 50 NA 70 96 ...
## $ SkinThickness : int 35 29 NA 23 35 NA 32 NA 45 NA ...
## $ Insulin : int NA NA NA 94 168 NA 88 NA 543 NA ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 NA ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : int 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : Factor w/ 2 levels "diabetes","healthy": 1 2 1 2 1 2 1 2 1 1 ...
# Keep complete cases only.  Base na.omit() is used directly: the original
# piped through `%>%` but no magrittr/dplyr library() call is visible, so the
# base form removes an undeclared dependency with identical behavior.
diabetes <- na.omit(diabetes)
dim(diabetes)
## [1] 392 9
After removing the NA values, the number of samples is reduced from 768 to 392.
Data preview
knitr::kable(head(diabetes))
|    | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI  | DiabetesPedigreeFunction | Age | Outcome  |
|----|-------------|---------|---------------|---------------|---------|------|--------------------------|-----|----------|
| 4  | 1           | 89      | 66            | 23            | 94      | 28.1 | 0.167                    | 21  | healthy  |
| 5  | 0           | 137     | 40            | 35            | 168     | 43.1 | 2.288                    | 33  | diabetes |
| 7  | 3           | 78      | 50            | 32            | 88      | 31.0 | 0.248                    | 26  | diabetes |
| 9  | 2           | 197     | 70            | 45            | 543     | 30.5 | 0.158                    | 53  | diabetes |
| 14 | 1           | 189     | 60            | 23            | 846     | 30.1 | 0.398                    | 59  | diabetes |
| 15 | 5           | 166     | 72            | 19            | 175     | 25.8 | 0.587                    | 51  | diabetes |
# Predictor table: every column except the outcome label.  Selecting by name
# instead of the magic positional index -9 survives column reordering.
diabetes_lm <- diabetes[, setdiff(names(diabetes), "Outcome")]

# Subset the predictors by group (row masks come from the full data frame)
diabetes_sub <- diabetes_lm[diabetes$Outcome == "diabetes", ]
healthy_sub <- diabetes_lm[diabetes$Outcome == "healthy", ]

# One histogram per predictor on a 2x4 grid
par(mfrow = c(2, 4))
for (i in colnames(diabetes_lm)) {
  hist(diabetes_lm[[i]], main = i, xlab = NULL)
}
# Overlay two histograms on one plot: x in translucent blue (healthy by the
# calling convention in this script), y in translucent red (diabetes).
#
# Args:
#   x, y  numeric vectors for the two groups
#   main  plot title; defaults to `i`, looked up in the calling environment,
#         to stay backward compatible with the existing loops that set `i`
#         to the current variable name
#
# Fixes vs the original:
#   * xlim spans both samples, so the second histogram is never clipped
#   * removed the bogus `ylim = range(y)` (data range used as a count axis)
histogram <- function(x, y, main = i) {
  hist(x, col = rgb(0, 0, 1, 1/4), main = main, xlab = NULL,
       xlim = range(c(x, y)))
  hist(y, col = rgb(1, 0, 0, 1/4), add = TRUE)
  legend("topright", legend = c("healthy", "diabetes"),
         fill = c(rgb(0, 0, 1, 1/4), rgb(1, 0, 0, 1/4)), cex = 0.8)
}
# Per-variable overlay histograms comparing healthy vs diabetes.
# NOTE: histogram() takes its plot title from the variable `i` found in the
# calling environment, so this loop variable must keep the name `i`.
par(mfrow=c(2,4))
for (i in colnames(diabetes_lm)){
histogram(healthy_sub[[i]], diabetes_sub[[i]])
}
The histograms show that some variables are highly skewed, so we apply a log2 transform to those variables.
# Log2-transform the right-skewed predictors identified from the histograms.
# A loop over a named vector replaces four copy-pasted assignment lines.
skewed_vars <- c("Glucose", "Insulin", "BMI", "DiabetesPedigreeFunction")
for (v in skewed_vars) {
  diabetes_lm[[v]] <- log2(diabetes_lm[[v]])
}

# Re-plot all predictor histograms after the transform
par(mfrow = c(2, 4))
for (i in colnames(diabetes_lm)) {
  hist(diabetes_lm[[i]], main = i, xlab = NULL)
}
# Per-variable overlay histograms after the log2 transform.
# NOTE(review): healthy_sub / diabetes_sub were built BEFORE the log2
# transform, so this plot still shows the untransformed group values even
# though diabetes_lm is now transformed.  They are deliberately not
# recomputed here: the t-tests below use these subsets, and their printed
# means match the raw (untransformed) units — confirm the intended behavior.
# NOTE: histogram() reads the loop variable `i` from the calling environment
# for its title, so this loop variable must keep the name `i`.
par(mfrow=c(2,4))
for (i in colnames(diabetes_lm)){
histogram(healthy_sub[[i]], diabetes_sub[[i]])
}
Q-Q plot
# Normal Q-Q plot for every (transformed) predictor, with a red reference line.
par(mfrow = c(2, 4))
for (var_name in colnames(diabetes_lm)) {
  qqnorm(diabetes_lm[[var_name]], main = var_name)
  qqline(diabetes_lm[[var_name]], col = "red", lwd = 2)
}
Box plot
# Grouped boxplots: each predictor split by Outcome.  Box colors follow the
# alphabetical factor levels ("diabetes" = red, "healthy" = blue).
# The redundant `data = diabetes` argument was dropped: the formula already
# uses fully qualified vectors, so `data` was never consulted.
par(mfrow = c(2, 4))
for (i in colnames(diabetes_lm)) {
  boxplot(diabetes_lm[[i]] ~ diabetes[["Outcome"]], main = i,
          xlab = NULL, ylab = NULL,
          col = c(rgb(1, 0, 0, 1/4), rgb(0, 0, 1, 1/4)))
}
Student t-test
# Two-sample t-test with an automatic equal-variance check.
#
# Runs an F test (var.test) first: if it does NOT reject equal variances
# (p >= 0.05), the classic Student's t-test (var.equal = TRUE) is used;
# otherwise Welch's t-test is used.
#
# Fix vs the original: the condition was inverted — it chose the
# equal-variance test when p < 0.05, i.e. exactly when the F test says the
# variances DIFFER.  The `abs(estimate) < 1.1` clause was dropped as well:
# the estimate is a variance ratio, always positive, and the ad-hoc 1.1
# cutoff had no statistical justification.
#
# Args:
#   x, y  numeric vectors (unpaired samples)
# Returns: an object of class "htest" from t.test()
t_test <- function (x,y) {
  var_test <- var.test(x, y, alternative = "two.sided")
  if (var_test$p.value >= 0.05) {
    # No evidence of unequal variances -> Student's two-sample t-test
    t.test(x, y, paired = FALSE, var.equal = TRUE, conf.level = 0.95)
  }
  else {
    # Variances differ -> Welch's t-test
    t.test(x, y, paired = FALSE, var.equal = FALSE, conf.level = 0.95)
  }
}
# Run the variance-checked t-test per predictor; the diabetes group is passed
# first, so "mean of x" in the output is the diabetic-group mean.
t_test(diabetes_sub$Glucose, healthy_sub$Glucose)
##
## Welch Two Sample t-test
##
## data: x and y
## t = 11.151, df = 218.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 27.79385 39.72817
## sample estimates:
## mean of x mean of y
## 145.1923 111.4313
# NOTE(review): three results are printed after the single Glucose call above.
# The second and third (means ~74/69 and ~33/27) look like BloodPressure and
# SkinThickness; those call lines appear to have been lost when this file was
# extracted — confirm against the original script.
##
## Welch Two Sample t-test
##
## data: x and y
## t = 3.761, df = 237.75, p-value = 0.0002131
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 2.432216 7.782699
## sample estimates:
## mean of x mean of y
## 74.07692 68.96947
##
## Welch Two Sample t-test
##
## data: x and y
## t = 5.3693, df = 276.33, p-value = 1.677e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 3.616261 7.802999
## sample estimates:
## mean of x mean of y
## 32.96154 27.25191
t_test(diabetes_sub$Insulin, healthy_sub$Insulin)
##
## Welch Two Sample t-test
##
## data: x and y
## t = 5.7337, df = 207.88, p-value = 3.429e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 49.86272 102.11966
## sample estimates:
## mean of x mean of y
## 206.8462 130.8550
# NOTE(review): a second result follows the Insulin call; its means
# (~0.63/0.47) match DiabetesPedigreeFunction — presumably that call line is
# also missing.  Verify against the original script.
##
## Welch Two Sample t-test
##
## data: x and y
## t = 3.8245, df = 200.74, p-value = 0.0001749
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.07431668 0.23251667
## sample estimates:
## mean of x mean of y
## 0.6255846 0.4721679
t_test(diabetes_sub$BMI, healthy_sub$BMI)
##
## Welch Two Sample t-test
##
## data: x and y
## t = 5.5571, df = 259.51, p-value = 6.797e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 2.599983 5.453875
## sample estimates:
## mean of x mean of y
## 35.77769 31.75076